assess <- read_csv("anonymisedData/assessments.csv")
course <- read_csv("anonymisedData/courses.csv")
student.assess <- read_csv("anonymisedData/studentAssessment.csv")
student.info <- read_csv("anonymisedData/studentInfo.csv")
student.regio <- read_csv("anonymisedData/studentRegistration.csv")
student.vle <- read_csv("anonymisedData/studentVle.csv")
vle <- read_csv("anonymisedData/vle.csv")
table(assess$code_module) # 7 modules
##
## AAA BBB CCC DDD EEE FFF GGG
## 12 42 20 35 15 52 30
glimpse(assess)
## Observations: 206
## Variables: 6
## $ code_module <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J", "…
## $ id_assessment <dbl> 1752, 1753, 1754, 1755, 1756, 1757, 1758, 1759…
## $ assessment_type <chr> "TMA", "TMA", "TMA", "TMA", "TMA", "Exam", "TM…
## $ date <dbl> 19, 54, 117, 166, 215, NA, 19, 54, 117, 166, 2…
## $ weight <dbl> 10, 20, 20, 20, 30, 100, 10, 20, 20, 20, 30, 1…
glimpse(course)
## Observations: 22
## Variables: 3
## $ code_module <chr> "AAA", "AAA", "BBB", "BBB", "BBB", "B…
## $ code_presentation <chr> "2013J", "2014J", "2013J", "2014J", "…
## $ module_presentation_length <dbl> 268, 269, 268, 262, 240, 234, 269, 24…
glimpse(student.assess)
## Observations: 173,912
## Variables: 5
## $ id_assessment <dbl> 1752, 1752, 1752, 1752, 1752, 1752, 1752, 1752, 1…
## $ id_student <dbl> 11391, 28400, 31604, 32885, 38053, 45462, 45642, …
## $ date_submitted <dbl> 18, 22, 17, 26, 19, 20, 18, 19, 9, 18, 19, 18, 17…
## $ is_banked <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ score <dbl> 78, 70, 72, 69, 79, 70, 72, 72, 71, 68, 73, 67, 7…
glimpse(student.info) # students demoraphics + final results for all students
## Observations: 32,593
## Variables: 12
## $ code_module <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J"…
## $ id_student <dbl> 11391, 28400, 30268, 31604, 32885, 38053, 4…
## $ gender <chr> "M", "F", "F", "F", "F", "M", "M", "F", "F"…
## $ region <chr> "East Anglian Region", "Scotland", "North W…
## $ highest_education <chr> "HE Qualification", "HE Qualification", "A …
## $ imd_band <chr> "90-100%", "20-30%", "30-40%", "50-60%", "5…
## $ age_band <chr> "55<=", "35-55", "35-55", "35-55", "0-35", …
## $ num_of_prev_attempts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ studied_credits <dbl> 240, 60, 60, 60, 60, 60, 60, 120, 90, 60, 6…
## $ disability <chr> "N", "N", "Y", "N", "N", "N", "N", "N", "N"…
## $ final_result <chr> "Pass", "Pass", "Withdrawn", "Pass", "Pass"…
glimpse(student.regio) # student registration details
## Observations: 32,593
## Variables: 5
## $ code_module <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "A…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J",…
## $ id_student <dbl> 11391, 28400, 30268, 31604, 32885, 38053, 45…
## $ date_registration <dbl> -159, -53, -92, -52, -176, -110, -67, -29, -…
## $ date_unregistration <dbl> NA, NA, 12, NA, NA, NA, NA, NA, NA, NA, NA, …
glimpse(student.vle) # interaction with virtual learning envir
## Observations: 10,655,280
## Variables: 6
## $ code_module <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J", "…
## $ id_student <dbl> 28400, 28400, 28400, 28400, 28400, 28400, 2840…
## $ id_site <dbl> 546652, 546652, 546652, 546614, 546714, 546652…
## $ date <dbl> -10, -10, -10, -10, -10, -10, -10, -10, -10, -…
## $ sum_click <dbl> 4, 1, 1, 11, 1, 8, 2, 15, 17, 1, 1, 1, 3, 4, 3…
glimpse(vle)
## Observations: 6,364
## Variables: 6
## $ id_site <dbl> 546943, 546712, 546998, 546888, 547035, 546614…
## $ code_module <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "AAA…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J", "…
## $ activity_type <chr> "resource", "oucontent", "resource", "url", "r…
## $ week_from <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ week_to <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
There are 7 modules (corses) and for each module there can be 3 assessment types (not all present in every module)
table(assess$assessment_type, assess$code_module)
##
## AAA BBB CCC DDD EEE FFF GGG
## CMA 0 15 8 7 0 28 18
## Exam 2 4 4 4 3 4 3
## TMA 10 23 8 24 12 20 9
n.modules <- student.info %>% group_by(id_student) %>% summarise(len.modules = length(code_module))
There is a total of 28785 students enrolled and a student attended a minimum of 1 to a maximum of 5.
There are demographics data and assessment results for all students that registered in a module.
sum(!(student.assess$id_student %in% student.info$id_student))
## [1] 0
sum(!(student.info$id_student %in% student.assess$id_student))
## [1] 5847
sum(!(student.info$id_student %in% student.assess$id_student))
## [1] 5847
sum(!(student.info$id_student %in% student.regio$id_student))
## [1] 0
In theory, if a student also has an unregistration date they should not have a final score for an exam for a module. Indeed, Students who unregistered have Withdrawal as the value of the final_result column in the studentInfo.csv file. However, this is not always the case and I will clean up students with a different final_result entry.
Assessment are submitted throghout the module (different dates) and if a student does not submit an assessment then there is no record.
There are 23351 students with at least an assessment available with a valid score.
All student with an assessment have demographics available.
5416 students do not have an assessment available.
There are 2711 students who did not interact with the VLE
There are 25 students with an assessment did not interact with the VLE.
sum(!(student.info$id_student %in% student.vle$id_student))
## [1] 2852
sum(!(student.assess$id_student %in% student.vle$id_student))
## [1] 77
vle.combine <- vle %>% left_join(student.vle) %>% dplyr::rename(date.vle = date)
# Connect assessment types scores with scores
ass <- student.assess %>% left_join(assess)
summary(ass$score)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 65.0 80.0 75.8 90.0 100.0 173
table(sign(ass$date_submitted), ass$code_module, ass$code_presentation)
## , , = 2013B
##
##
## AAA BBB CCC DDD EEE FFF GGG
## -1 0 40 0 58 0 59 0
## 0 0 0 0 1 0 5 0
## 1 0 11216 0 10314 0 12131 0
##
## , , = 2013J
##
##
## AAA BBB CCC DDD EEE FFF GGG
## -1 0 270 0 172 0 194 0
## 0 0 3 0 0 0 3 0
## 1 1633 14102 0 7764 2884 16043 5950
##
## , , = 2014B
##
##
## AAA BBB CCC DDD EEE FFF GGG
## -1 0 194 0 74 1 156 0
## 0 0 7 0 2 0 0 0
## 1 0 9792 7489 4466 1779 10040 4896
##
## , , = 2014J
##
##
## AAA BBB CCC DDD EEE FFF GGG
## -1 42 37 137 278 39 290 16
## 0 0 30 0 0 0 10 0
## 1 1474 7341 11314 7735 3190 15884 4357
table(sign(ass$date_submitted), ass$assessment_type)
##
## CMA Exam TMA
## -1 641 0 1416
## 0 18 0 43
## 1 69868 4959 96967
ass$date.sign <- sign(ass$date_submitted)
ggplot(ass, aes(x = factor(date.sign), y = score, fill = code_module)) + geom_boxplot()
ass <- ass %>% filter(!is.na(score))
There are 0 NAs assessment scores and 2055 assignments were submitted before the start of the course. Given the large number and my limited knolwledge about the dataset I will keep them in the study. They might be entry tests. However, I will remove the NAs assessment score.
On the online resource it says: date – information about the final submission date of the assessment calculated as the number of days SINCE the start of the module-presentation. The starting date of the presentation has number 0 (zero). How can it be negative??
Still after filtering this leaves me with 173739 assessments across 23351 students.
# Now I know which student what id assessment did.
combine <- student.info %>% # code_module , code_pres, id_student, demo, final score.
left_join(student.regio) %>%
left_join(course)
summary(ass$score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 65.0 80.0 75.8 90.0 100.0
ass.total <- ass %>% group_by(code_module, code_presentation, id_student) %>%
summarise(comb.score = mean(score))
click.total <- vle.combine %>% group_by(code_module, code_presentation, id_student) %>%
summarise(total.clicks = sum(sum_click))
# Merge assessment and clicks
click.score <- ass.total %>% left_join(click.total)
In this first analysis I am interested in understanding what are the factors that influence withdrawal (1 = withdrawl, 0 = PASS/FAIL/Distinction) and Success (1 = PASS/Distinction, 0 = FAIL). The factors influencing withdrawal could be very different from the factors leading to failure, e.g. external factors versus student’s capabilities.
I will create a total weigthed score for the assigment and exams scores available in each module. Some students, if they withdrew do not have score for a module so there will be NAs once combining this meta-score with student infos. I also created a total score for the number of clicks in the VLE done in one module in each semester. I will also create a variable with total number of students in each module.
All students who unregistered during the course could only be Withdrawn as final result since they left the course.
However, it appears that some students that never unregistered still withdrawn from the module. Also, Some students never registered but still they unregistered or they are in course??
combine$Unregistered <- ifelse(is.na(combine$date_unregistration), "in course",
"unregistered")
combine$Registered <- ifelse(is.na(combine$date_registration), "not registered",
"registered")
sum.reg <- combine %>% group_by(Registered, Unregistered, final_result) %>%
summarise(n.students = length(unique(id_student)))
knitr::kable(sum.reg, caption = "Number of students by registration date, unregistration date and final result.")
| Registered | Unregistered | final_result | n.students |
|---|---|---|---|
| not registered | in course | Fail | 4 |
| not registered | in course | Pass | 1 |
| not registered | unregistered | Withdrawn | 38 |
| registered | in course | Distinction | 2903 |
| registered | in course | Fail | 6799 |
| registered | in course | Pass | 11880 |
| registered | in course | Withdrawn | 93 |
| registered | unregistered | Fail | 9 |
| registered | unregistered | Withdrawn | 9043 |
combine <- combine %>% filter(!(Registered %in% "not registered"), !(Unregistered %in%
"unregistered" & Registered %in% "registered" & final_result %in% "Fail"))
We have now 28748 students.
dim(combine)
## [1] 32539 17
combine.feat <- combine %>% left_join(click.score)
dim(combine.feat)
## [1] 32539 19
nas <- apply(combine.feat, 2, function(x) sum(is.na(x)))
nas
## code_module code_presentation
## 0 0
## id_student gender
## 0 0
## region highest_education
## 0 0
## imd_band age_band
## 1111 0
## num_of_prev_attempts studied_credits
## 0 0
## disability final_result
## 0 0
## date_registration date_unregistration
## 0 22515
## module_presentation_length Unregistered
## 0 0
## Registered comb.score
## 0 6725
## total.clicks
## 6775
table(combine.feat$highest_education)
##
## A Level or Equivalent HE Qualification
## 14022 4723
## Lower Than A Level No Formal quals
## 13135 346
## Post Graduate Qualification
## 313
table(combine.feat$num_of_prev_attempts)
##
## 0 1 2 3 4 5 6
## 28376 3292 674 142 38 13 4
combine.feat <- combine.feat %>% mutate(withdraw = ifelse(final_result %in%
"Withdrawn", 1, 0), imd_band_num = as.numeric(as.factor(imd_band)), highest_education = factor(highest_education,
levels = c("No Formal quals", "Lower Than A Level", "A Level or Equivalent",
"HE Qualification", "Post Graduate Qualification")), num_of_prev_attempts_dich = ifelse(num_of_prev_attempts >
0, ">0", "0"), year = str_remove(code_presentation, pattern = "B|J"), semester = str_extract(code_presentation,
pattern = "B|J"), code_presentation = factor(code_presentation, levels = c("2013B",
"2014B", "2013J", "2014J")))
repl <- names(table(combine.feat$id_student)[table(combine.feat$id_student) >
1])
repl.student <- combine.feat[combine.feat$id_student %in% repl, ]
repl.student.sum <- repl.student %>% group_by(id_student) %>% summarise(n.modules = length(unique(code_module)),
n.results = length(unique(final_result)), n.repl = length(code_module))
ggplot(repl.student.sum, aes(x = factor(n.modules), fill = factor(n.results))) +
geom_bar(position = "dodge") + facet_wrap(~n.repl) + theme_bw()
student.to.keep <- combine.feat %>% group_by(code_module, id_student) %>% summarise(code_presentation = code_presentation[which.max(num_of_prev_attempts)]) %>%
unite(key.student.module, code_module, id_student, code_presentation, sep = "-",
remove = FALSE)
combine.feat.unique <- combine.feat %>% unite(key.student.module, code_module,
id_student, code_presentation, sep = "-", remove = FALSE) %>% filter(key.student.module %in%
student.to.keep$key.student.module)
combine.feat.unique <- combine.feat.unique[!duplicated(combine.feat.unique$id_student),
]
nrow(combine.feat.unique)/nrow(combine.feat) # removed 12% of the initial data
## [1] 0.8834937
# Total number of students attending a course - includign students who
# attempted different modules
nstudents <- combine.feat %>% group_by(code_module, code_presentation) %>% summarise(n.students = length(unique(id_student)))
combine.feat.unique <- combine.feat.unique %>% left_join(nstudents)
sum.reg <- combine.feat.unique %>% group_by(Registered, Unregistered, final_result) %>%
summarise(n.students = length(unique(id_student)))
colnames(sum.reg) <- c("Registered", "Unregistered", "Final result", "Number of students")
knitr::kable(sum.reg, caption = "Number of students by registration date, unregistration date and final result.")
| Registered | Unregistered | Final result | Number of students |
|---|---|---|---|
| registered | in course | Distinction | 2684 |
| registered | in course | Fail | 6548 |
| registered | in course | Pass | 11119 |
| registered | in course | Withdrawn | 91 |
| registered | unregistered | Withdrawn | 8306 |
In 2014 there was an increased activity on the VLe which is probably why it helped improving success.
vle.combine.sum <- vle.combine %>% group_by(code_module, code_presentation,
activity_type, id_student) %>% summarise(sum_click = sum(sum_click)) %>%
unite(key.student.module, code_module, id_student, code_presentation, sep = "-",
remove = FALSE) %>% filter(key.student.module %in% student.to.keep$key.student.module) %>%
mutate(year = str_remove(code_presentation, pattern = "B|J"), semester = str_extract(code_presentation,
pattern = "B|J")) %>% left_join(nstudents)
nstudents.year.semester <- combine.feat %>% group_by(year, semester) %>% summarise(n.students = length(unique(id_student)))
vle.combine.sum.sum <- vle.combine.sum %>% group_by(activity_type, year, semester) %>%
summarise(n.clicks = sum(sum_click)) %>% ungroup() %>% left_join(nstudents.year.semester) %>%
mutate(n.clicks.std = n.clicks/n.students) %>% mutate(activity_type = fct_reorder(activity_type,
n.clicks.std))
ggplot(vle.combine.sum.sum, aes(x = activity_type, y = n.clicks.std, fill = year)) +
geom_bar(stat = "identity", position = "dodge") + facet_wrap(~semester) +
coord_flip() + theme_bw()
vle.combine.top5 <- vle.combine.sum %>% filter(activity_type %in% c("oucontent",
"forumng", "homepage", "quiz", "subpage"))
vle.combine.sum.sum <- vle.combine.top5 %>% group_by(activity_type, semester,
code_module, n.students) %>% summarise(n.clicks = sum(sum_click)/unique(n.students)) %>%
unite(key, activity_type, semester, code_module, sep = "-", remove = FALSE) %>%
ungroup() %>% mutate(activity_type = fct_reorder(activity_type, n.clicks))
ggplot(vle.combine.sum.sum, aes(x = code_module, y = n.clicks, fill = semester)) +
geom_bar(stat = "identity", position = "dodge", colour = "white") + facet_wrap(~activity_type) +
theme_bw() + coord_flip()
# Sum of clicks done by each student per activity type
click.students.activity <- vle.combine.sum %>% filter(activity_type %in% c("oucontent",
"forumng", "homepage", "quiz", "subpage")) %>% group_by(code_module, year,
semester, activity_type, id_student) %>% summarise(n.clicks.act = sum(sum_click))
n.students.vle <- click.students.activity %>% group_by(code_module, year, semester) %>%
summarise(n.students.clicks = length(unique(id_student)))
click.students.activity <- click.students.activity %>% left_join(n.students.vle) %>%
mutate(clicks.per.student = n.clicks.act/n.students.clicks)
sum.students <- combine.feat %>% group_by(code_presentation, code_module) %>%
summarise(n.students = length(unique(id_student))) %>% ungroup() %>% mutate(year = str_remove(code_presentation,
pattern = "B|J"), semester = str_extract(code_presentation, pattern = "B|J"),
code_presentation = factor(code_presentation, levels = c("2013B", "2014B",
"2013J", "2014J"))) %>% dplyr::rename(`Code presentation` = code_presentation)
# There seem to be a semester effect in the number students, with more
# students enrolling in the J
ggplot(sum.students, aes(x = code_module, y = n.students, fill = `Code presentation`)) +
geom_bar(stat = "identity", position = "dodge", colour = "white") + theme_bw() +
ggtitle("Number of students enrolled in each module") + theme(legend.position = "bottom") +
labs(x = "Code of module", y = "Number of student enrolled")
Number of students enrolled in each module across years and presentations. The figure included students who enrolled multiple time (same or different module).
# There doesn't seem to be a year effect
ggplot(sum.students, aes(x = code_module, y = n.students, fill = year)) + facet_wrap(~semester) +
geom_bar(stat = "identity", position = "dodge", colour = "white") + theme_bw() +
ggtitle("Number of students enrolled in each module")
ggplot(combine.feat.unique, aes(x = comb.score, fill = final_result)) + geom_density() +
theme_bw()
We defined the success rate as the proportion of students that either passed a module with Distinction or Pass out of all the students that registered into a module but excluding students who Withdrawn. Conversely, we defined the dropout rate as the proportion of students that Withdrawn from a module out of all the students that registered into any module. The reasons that push a student to dropout from a module could be very different from the reasons that led to failure.
combine.feat.unique <- combine.feat.unique %>% mutate(pass_fail = ifelse(final_result %in%
c("Fail"), 0, 1)) %>% mutate(pass_fail = ifelse(final_result %in% "Withdrawn",
NA, pass_fail))
table(combine.feat.unique$pass_fail, combine.feat.unique$final_result)
##
## Distinction Fail Pass Withdrawn
## 0 0 6548 0 0
## 1 2684 0 11119 0
table(combine.feat.unique$withdraw, combine.feat.unique$final_result)
##
## Distinction Fail Pass Withdrawn
## 0 2684 6548 11119 0
## 1 0 0 0 8397
# Poeple that withdraw tend to have a slightly lower socio economic statis
ggplot(combine.feat.unique,aes(x = factor(withdraw), y = imd_band_num)) + geom_boxplot() + facet_wrap(~semester) + theme_bw()
# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.region.with <- combine.feat.unique %>%
group_by(region) %>%
summarise(imd.mean = mean(imd_band_num,na.rm = TRUE),
sem.imd.mean = sd(imd_band_num,na.rm = TRUE)/sqrt(length(imd_band_num[!is.na(imd_band_num)])),
with.mean = mean(withdraw),
sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>%
mutate(region = fct_reorder(region,with.mean))
ggplot(sum.region.with,aes(x = with.mean, y = imd.mean,colour=region)) + geom_point(stat = "identity") + theme_bw() +
geom_errorbar(aes(ymin = imd.mean - sem.imd.mean,
ymax = imd.mean + sem.imd.mean)) +
geom_errorbarh(aes( xmin = with.mean - sem.with.mean,
xmax = with.mean + sem.with.mean)) +
geom_label_repel(aes(label = region)) +
geom_hline(yintercept = mean(combine.feat.unique$imd_band_num,na.rm = TRUE),linetype="dotted") + ylim(c(3.5,7.5)) +
geom_vline(xintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted")
I will use module BBB as reference for the module, it’s around the mean of withdrawal
The module with higheest prop of withdrawal are CCC/DDD and the ones with less withdrawal are AAA and GGG. The socio econoic area does not seem to be too heterogenous apart from module AAA.
sum.region <- combine.feat.unique %>%
group_by(code_module,semester,year) %>%
summarise(imd.mean = mean(imd_band_num,na.rm = TRUE),
sem.imd.mean = sd(imd_band_num,na.rm = TRUE)/sqrt(length(imd_band_num[!is.na(imd_band_num)])),
with.mean = mean(withdraw),
sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>%
ungroup() %>%
mutate(code_module = fct_reorder(code_module,with.mean))
ggplot(sum.region,aes(x = with.mean, y = imd.mean,colour=code_module,shape=year)) + geom_point(stat = "identity",size=2) + theme_bw() +
geom_errorbar(aes(ymin = imd.mean - sem.imd.mean,
ymax = imd.mean + sem.imd.mean)) +
geom_errorbarh(aes( xmin = with.mean - sem.with.mean,
xmax = with.mean + sem.with.mean)) +
geom_label_repel(aes(label = code_module)) + facet_wrap(~semester) +
geom_hline(yintercept = mean(combine.feat.unique$imd_band_num,na.rm = TRUE),linetype="dotted") + ylim(c(3.5,7.5))+
geom_vline(xintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted")
sum.region <- combine.feat.unique %>% group_by(code_module, semester, year) %>%
summarise(with.mean = mean(withdraw), sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>%
ungroup() %>% mutate(code_module = fct_reorder(code_module, with.mean))
ggplot(sum.region, aes(x = code_module, y = with.mean, colour = code_module,
shape = year)) + geom_point(stat = "identity", size = 2) + theme_bw() +
geom_errorbar(aes(ymin = with.mean - sem.with.mean, ymax = with.mean + sem.with.mean),
width = 0.5) + facet_wrap(~semester) + geom_hline(yintercept = mean(combine.feat.unique$withdraw,
na.rm = TRUE), linetype = "dotted")
# IMD bands
imd.band <- unique(combine.feat.unique[,c("imd_band","imd_band_num")]) %>%
filter(!is.na(imd_band)) %>%
mutate(imd_band = ifelse(imd_band %in% "10-20","10-20%",imd_band),
imd.mean = imd_band_num)
# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.region.pass.fail <- combine.feat.unique %>%
group_by(region) %>%
summarise(imd.mean = round(mean(imd_band_num,na.rm = TRUE)),
sem.imd.mean = sd(imd_band_num,na.rm = TRUE)/sqrt(length(imd_band_num[!is.na(imd_band_num)])),
with.mean = mean(withdraw,na.rm = TRUE),
sem.with.mean = sd(withdraw,na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)])),
pass.mean = mean(pass_fail,na.rm = TRUE),
sem.pass.mean = sd(pass_fail,na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)]))) %>%
mutate(region = fct_reorder(region,pass.mean)) %>%
left_join(imd.band) %>%
mutate(Region = paste0(region," (IMD",imd_band,")")) %>%
dplyr::rename(IMD_band = "imd_band")
plot.region =ggplot(sum.region.pass.fail,aes(x = pass.mean, y = with.mean,colour=IMD_band)) + geom_point() + theme_classic() +
geom_vline(xintercept = mean(combine.feat.unique$pass_fail,na.rm = TRUE),linetype="dotted",colour="grey") +
geom_hline(yintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted",colour="grey") +
geom_label_repel(aes(label = Region),show.legend = FALSE,size=2) + labs(x = "Success rate", y = "Dropout rate") + theme(legend.position = "bottom")
I will use module BBB as reference for the module, it’s around the mean of withdrawal
The module with higheest prop of withdrawal are CCC/DDD and the ones with less withdrawal are AAA and GGG. The socio econoic area does not seem to be too heterogenous apart from module AAA.
sum.module.with <- combine.feat.unique %>% group_by(code_module) %>% summarise(with.mean = mean(withdraw,
na.rm = TRUE), sem.with.mean = sd(withdraw, na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)]))) %>%
ungroup() %>% mutate(code_module = fct_reorder(code_module, with.mean))
# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.region.pass.fail <- combine.feat.unique %>% group_by(code_module) %>% summarise(pass.mean = mean(pass_fail,
na.rm = TRUE), sem.pass.mean = sd(pass_fail, na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)])),
imd.mean = round(mean(imd_band_num, na.rm = TRUE))) %>% mutate(code_module = fct_reorder(code_module,
pass.mean)) %>% left_join(sum.module.with) %>% left_join(imd.band) %>% mutate(Module = paste0(code_module,
" (IMD", imd_band, ")")) %>% dplyr::rename(IMD_band = imd_band)
data.overlay <- data.frame(success.rate = c(mean(combine.feat.unique$pass_fail,
na.rm = TRUE), 0.2), dropout.rate = c(0.2, mean(combine.feat.unique$withdraw,
na.rm = TRUE)), label = c("Average success rate", "Average dropout rate"))
plot.module = ggplot(sum.region.pass.fail, aes(x = pass.mean, y = with.mean,
colour = IMD_band)) + geom_point() + theme_classic() + geom_vline(xintercept = mean(combine.feat.unique$pass_fail,
na.rm = TRUE), linetype = "dotted", colour = "grey") + geom_hline(yintercept = mean(combine.feat.unique$withdraw,
na.rm = TRUE), linetype = "dotted", colour = "grey") + geom_label_repel(aes(label = Module),
show.legend = FALSE, size = 2) + labs(x = "Success rate", y = "Dropout rate") +
theme(legend.position = "bottom") + xlim(0.55, 0.9) + ylim(0.1, 0.45)
legend <- get_legend(plot.region)
pc1 = plot_grid(plot.region + ggtitle("Success/dropout rate\nby region") + theme(legend.position = "none") +
xlim(0.55, 0.9) + ylim(0.1, 0.45), plot.module + ggtitle("Success/dropout rate\nby module") +
theme(legend.position = "none") + xlim(0.55, 0.9) + ylim(0.1, 0.45), nrow = 1)
figure1 = plot_grid(pc1)
figure1
ggsave("output/success-dropout-region-module.pdf", width = 12, height = 6)
sum.module.with <- combine.feat.unique %>% group_by(code_module, semester, year) %>%
summarise(with.mean = mean(withdraw, na.rm = TRUE), sem.with.mean = sd(withdraw,
na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)]))) %>% ungroup() %>%
mutate(code_module = fct_reorder(code_module, with.mean))
sum.region.pass.fail <- combine.feat.unique %>% group_by(code_module, semester,
year) %>% summarise(pass.mean = mean(pass_fail, na.rm = TRUE), sem.pass.mean = sd(pass_fail,
na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)]))) %>% ungroup() %>%
mutate(code_module = fct_reorder(code_module, pass.mean)) %>% left_join(sum.module.with)
ggplot(sum.region.pass.fail, aes(x = pass.mean, y = with.mean, colour = code_module,
shape = year)) + geom_point(stat = "identity", size = 2) + theme_bw() +
geom_text_repel(aes(label = code_module)) + facet_wrap(~semester) + geom_hline(yintercept = mean(combine.feat.unique$withdraw,
na.rm = TRUE), linetype = "dotted") + geom_vline(xintercept = mean(combine.feat.unique$pass_fail,
na.rm = TRUE), linetype = "dotted")
ggsave("output/dropout-success-module-semester-year.png", width = 10, height = 10)
# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.edu.pass.fail <- combine.feat.unique %>%
group_by(highest_education) %>%
summarise(with.mean = mean(withdraw,na.rm = TRUE),
sem.with.mean = sd(withdraw,na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)])),
pass.mean = mean(pass_fail,na.rm = TRUE),
sem.pass.mean = sd(pass_fail,na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)])),
imd.mean = round(mean(imd_band_num,na.rm = TRUE))) %>%
mutate(highest_education = fct_reorder(highest_education,pass.mean)) %>%
left_join(imd.band) %>%
mutate(Highest_education = paste0(highest_education," (IMD",imd_band,")")) %>%
dplyr::rename(IMD_band = imd_band)
p4 = ggplot(sum.edu.pass.fail,aes(x = pass.mean, y = with.mean,colour=IMD_band)) + geom_point() + theme_classic() +
geom_vline(xintercept = mean(combine.feat.unique$pass_fail,na.rm = TRUE),linetype="dotted",colour="grey") +
geom_hline(yintercept = mean(combine.feat.unique$withdraw,na.rm = TRUE),linetype="dotted",colour="grey") +
geom_text_repel(aes(label = Highest_education),show.legend = FALSE,size=2) + labs(x = "Success rate", y = "Dropout rate") + theme(legend.position = "bottom")
col.props <- data.frame(prop.table(table(combine.feat.unique$final_result, combine.feat.unique$imd_band),
margin = 2)) %>% dplyr::rename(`Final result` = Var1) %>% mutate(`Final result` = factor(`Final result`,
levels = c("Distinction", "Pass", "Fail", "Withdrawn"))) %>% mutate(Var2 = factor(Var2,
levels = c("0-10%", "10-20", "20-30%", "30-40%", "40-50%", "50-60%", "60-70%",
"70-80%", "80-90%", "90-100%"), labels = c(c("0-10", "10-20", "20-30",
"30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"))))
plot.imd = ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = `Final result`,
group = `Final result`)) + geom_point() + geom_line() + theme_bw() + labs(x = "Band of Index of Multiple Depravation (%)",
y = "% Students") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(axis.text.x = element_text(angle = 5, hjust = 0.5, size = 7), legend.key.size = unit(0.3,
"cm"), legend.key.width = unit(0.05, "cm"), axis.title = element_text(size = 9),
title = element_text(size = 10), legend.position = "bottom")
col.props <- data.frame(prop.table(table(combine.feat.unique$highest_education,
combine.feat.unique$age_band), margin = 2))
p2 = ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = Var1, group = Var1)) +
geom_point() + geom_line() + theme_bw() + labs(x = "Age class", y = "% Students") +
theme(legend.position = "bottom")
# education
col.props <- data.frame(prop.table(table(combine.feat.unique$final_result, combine.feat.unique$highest_education),
margin = 2)) %>% dplyr::rename(`Final result` = Var1) %>% mutate(`Final result` = factor(`Final result`,
levels = c("Distinction", "Pass", "Fail", "Withdrawn"))) %>% mutate(Var2 = factor(Var2,
levels = levels(Var2), labels = c("No formal qual", "< Than A", "A or equivalent",
"HE", "Post Graduate")))
plot.edu = ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = `Final result`,
group = `Final result`)) + geom_point() + geom_line() + theme_bw() + labs(x = "Highest education",
y = "% Students") + theme(legend.position = "bottom") + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + theme(axis.text.x = element_text(angle = 5, hjust = 0.5, size = 7),
legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.05, "cm"),
axis.title = element_text(size = 9), title = element_text(size = 10))
p = plot_grid(plot.imd + theme(legend.position = "none"), plot.edu + theme(legend.position = "none"),
nrow = 1)
legend <- get_legend(plot.imd)
plot_grid(p, legend, nrow = 2, rel_heights = c(5, 1))
ggsave("output/imd-education-final-result.pdf", width = 8, height = 3)
col.props <- data.frame(prop.table(table(combine.feat.unique$imd_band, combine.feat.unique$highest_education),
margin = 2)) %>% mutate(Var1 = factor(Var1, levels = c("0-10%", "10-20",
"20-30%", "30-40%", "40-50%", "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"),
labels = c(c("0-10%", "10-20%", "20-30%", "30-40%", "40-50%", "50-60%",
"60-70%", "70-80%", "80-90%", "90-100%")))) %>% dplyr::rename(`IMD band` = Var1) %>%
mutate(Var2 = factor(Var2, levels = levels(Var2), labels = c("No formal qual",
"< Than A", "A or equivalent", "HE", "Post Graduate")))
plot.imd.edu <- ggplot(col.props, aes(x = Var2, y = Freq * 100, colour = `IMD band`,
group = `IMD band`)) + geom_point() + geom_line() + theme_bw() + labs(x = "Highest education",
y = "% Students") + ggtitle("Qualification by IMD band") + theme(axis.text.x = element_text(angle = 5,
hjust = 0.5, size = 7), legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.05,
"cm"), axis.title = element_text(size = 9), title = element_text(size = 10))
# Scotland, wales and ireland are the ones with lower levels of widthdrawal
sum.edu.pass.fail <- combine.feat.unique %>%
group_by(gender) %>%
summarise(with.mean = mean(withdraw,na.rm = TRUE),
sem.with.mean = sd(withdraw,na.rm = TRUE)/sqrt(length(withdraw[!is.na(withdraw)])),
pass.mean = mean(pass_fail,na.rm = TRUE),
sem.pass.mean = sd(pass_fail,na.rm = TRUE)/sqrt(length(pass_fail[!is.na(pass_fail)])),
imd.mean = round(mean(imd_band_num,na.rm = TRUE))) %>%
mutate(gender = fct_reorder(gender,pass.mean)) %>%
dplyr::rename(Success = pass.mean,
Dropout = with.mean) %>%
dplyr::select(gender,Success,Dropout) %>%
gather(key = Result, value = Proportion,Success,Dropout)
plot.gender=ggplot(sum.edu.pass.fail,aes(x = gender, y = Proportion,fill=Result)) + geom_bar(stat="identity",position = "dodge") + theme_classic()+
theme(axis.text.x = element_text(angle = 5, hjust = 0.5,size=7),
legend.key.size = unit(0.3, "cm"),
axis.title = element_text(size=9),
title = element_text(size=10))
plot_grid(plot.imd.edu, plot.gender + ggtitle("Dropout/Success rate by gender"),
nrow = 1)
ggsave("output/imd-by-education.pdf", width = 8, height = 3)
combine.feat.clicks <- combine.feat.unique %>% left_join(click.students.activity) %>%
mutate(final_result = factor(final_result, levels = c("Distinction", "Pass",
"Fail", "Withdrawn")))
freq.results <- combine.feat.clicks %>% group_by(id_student, final_result) %>%
summarise(n.clicks = round(mean(n.clicks.act, na.rm = TRUE))) %>% mutate(VLE = ifelse(is.na(n.clicks),
"Never used VLE", "Used VLE")) %>% group_by(VLE, final_result) %>% summarise(n = n())
plot.freq.vle = ggplot(freq.results, aes(x = final_result, y = n, fill = VLE)) +
geom_bar(stat = "identity", position = "dodge") + theme_classic() + geom_text(aes(label = round(n)),
hjust = 0.5, vjust = -1, size = 2, position = position_dodge(width = 0.9)) +
labs(x = "Final result", y = "Number of students") + theme(axis.text.x = element_text(angle = 0,
hjust = 0.5, size = 6), axis.text.y = element_text(angle = 0, hjust = 0.5,
size = 6), legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.3,
"cm"), legend.text = element_text(size = 5), axis.title = element_text(size = 6),
title = element_text(size = 8), legend.position = "top")
plot.freq.vle
ggsave("output/usedVLE-by-result.pdf", width = 3, height = 3)
freq.results <- combine.feat.clicks %>% group_by(id_student, final_result) %>%
summarise(n.clicks = mean(n.clicks.act, na.rm = TRUE)) %>% mutate(VLE = ifelse(is.na(n.clicks),
"Never used VLE", "Used VLE"))
# Chi square
chisq.test(table(freq.results$final_result, freq.results$VLE))
##
## Pearson's Chi-squared test
##
## data: table(freq.results$final_result, freq.results$VLE)
## X-squared = 5684.8, df = 3, p-value < 2.2e-16
n.final.result <- combine.feat.unique %>% group_by(final_result) %>% summarise(n = n())
# sum all clicks done for each result and activity type and divide by number
# of students in each result category
result.clicks <- combine.feat.clicks %>% group_by(final_result, activity_type) %>%
left_join(n.final.result) %>% summarise(sum.clicks = round(sum(n.clicks.act,
na.rm = TRUE))/unique(n)) %>% filter(!is.na(activity_type)) %>% ungroup() %>%
mutate(activity_type = fct_reorder(activity_type, sum.clicks)) %>% mutate(final_result = factor(final_result,
levels = c("Distinction", "Pass", "Fail", "Withdrawn"))) %>% filter(!(activity_type %in%
c("ouwiki", "resource")))
top5 = ggplot(result.clicks, aes(x = final_result, y = sum.clicks, fill = activity_type)) +
geom_bar(stat = "identity", position = "dodge", colour = "white") + theme_classic() +
theme(legend.position = "bottom") + labs(x = "Final result", y = "Sum of clicks per activity type") +
geom_text(aes(label = round(sum.clicks)), hjust = 0.5, vjust = -1, size = 2,
position = position_dodge(width = 0.9)) + theme(axis.text.x = element_text(angle = 0,
hjust = 0.5, size = 6), axis.text.y = element_text(angle = 0, hjust = 0.5,
size = 6), legend.key.size = unit(0.3, "cm"), legend.key.width = unit(0.3,
"cm"), axis.title = element_text(size = 6), title = element_text(size = 8),
legend.position = "bottom")
ggdraw() + draw_plot(top5 + theme(legend.position = "bottom"), 0, 0, 1, 1) +
draw_plot(plot.freq.vle + theme(legend.position = "top"), x = 0.53, y = 0.55,
width = 0.5, height = 0.4, scale = 0.9) + draw_plot_label(c("A", "B"),
c(0, 0.58), c(1, 0.92), size = 10)
ggsave("output/usedVLE-by-result-top5.pdf", width = 5, height = 4)
# correlation
combine.feat.unique.num <- combine.feat.clicks %>% spread(key = activity_type,
value = n.clicks.act) %>% mutate_at(.vars = c("oucontent", "quiz", "homepage",
"subpage"), .funs = function(x) log(x)) %>% dplyr::select(pass_fail, oucontent,
quiz, homepage, subpage, disability, forumng, semester, year, gender, code_module) %>%
mutate_all(.funs = function(x) as.numeric(as.factor(x)))
corrplot::corrplot(cor(combine.feat.unique.num, use = "pairwise.complete.obs"))
combine.feat.unique.num <- combine.feat.clicks %>% spread(key = activity_type,
value = n.clicks.act)
apply(combine.feat.unique.num, 2, function(x) sum(is.na(x)))
## key.student.module code_module
## 0 0
## code_presentation id_student
## 0 0
## gender region
## 0 0
## highest_education imd_band
## 0 4255
## age_band num_of_prev_attempts
## 0 0
## studied_credits disability
## 0 0
## final_result date_registration
## 0 0
## date_unregistration module_presentation_length
## 91644 0
## Unregistered Registered
## 0 0
## comb.score total.clicks
## 12094 12140
## withdraw imd_band_num
## 0 4255
## num_of_prev_attempts_dich year
## 0 0
## semester n.students
## 0 0
## pass_fail n.students.clicks
## 25237 2994
## clicks.per.student forumng
## 2994 93754
## homepage oucontent
## 90881 92986
## quiz subpage
## 98758 91718
## <NA>
## 116606
# remove homepage
m <- glm(pass_fail ~ disability + log(oucontent) + semester + year + gender +
code_module, data = combine.feat.unique.num, family = binomial)
car::vif(m)
## GVIF Df GVIF^(1/(2*Df))
## disability 1.007914 1 1.003949
## log(oucontent) 2.522731 1 1.588311
## semester 1.084754 1 1.041515
## year 1.182951 1 1.087636
## gender 1.547445 1 1.243963
## code_module 3.985378 6 1.122120
tidym <- tidy(m, conf.int = TRUE)
tidym <- tidym %>% mutate(term = factor(term, levels = term)) %>% filter(term !=
"(Intercept)")
ggplot(tidym, aes(x = term, y = estimate, colour = -log10(p.value))) + geom_point() +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0,
linetype = "dotted") + coord_flip() + ggtitle("Clicks associated with success vs failure")
sum.region <- combine.feat.clicks %>%
group_by(code_module,semester) %>%
summarise(clicks.mean = mean(n.clicks.act,na.rm = TRUE),
sem.clicks.mean = sd(n.clicks.act,na.rm = TRUE)/sqrt(length(n.clicks.act[!is.na(n.clicks.act)])),
with.mean = mean(withdraw),
sem.with.mean = sd(withdraw)/sqrt(length(withdraw))) %>%
ungroup() %>%
mutate(code_module = fct_reorder(code_module,with.mean))
ggplot(sum.region,aes(x = with.mean, y = clicks.mean,colour=code_module)) + geom_point(stat = "identity") + theme_bw() +
geom_errorbar(aes(ymin = clicks.mean - sem.clicks.mean,
ymax = clicks.mean + sem.clicks.mean)) +
geom_errorbarh(aes( xmin = with.mean - sem.with.mean,
xmax = with.mean + sem.with.mean)) +
geom_label_repel(aes(label = code_module)) + facet_wrap(~semester) + labs(x="Dropout rate",y="Mean of clicks")
# centered on the mean
combine.feat.unique$comb.score.std <- scale(combine.feat.unique$comb.score,center = TRUE,scale = TRUE)
combine.feat.unique$total.clicks.std <- scale(log(combine.feat.unique$total.clicks),center = TRUE,scale = TRUE)
combine.feat.unique$studied_credits.std <- scale(log(combine.feat.unique$studied_credits),center = TRUE,scale = TRUE)
par(mfrow=c(2,2))
hist(combine.feat.unique$comb.score.std)
hist(combine.feat.unique$total.clicks.std)
hist(combine.feat.unique$studied_credits.std)
# Recode modules
regions <- names(table(combine.feat.unique$region))[!(names(table(combine.feat.unique$region)) %in% "South West Region")]
combine.feat.unique <- combine.feat.unique %>%
mutate(code_module = factor(code_module,
levels = c("BBB","AAA","CCC","DDD","EEE","FFF","GGG")),
region = factor(region, levels=c("South West Region",regions)))
# No imd
m <- glm(withdraw ~ gender + region + highest_education + studied_credits.std +
disability + module_presentation_length + code_module + semester + year +
total.clicks.std + n.students + num_of_prev_attempts_dich + comb.score.std,
data = combine.feat.unique, family = binomial)
car::vif(m)
## GVIF Df GVIF^(1/(2*Df))
## gender 1.488549 1 1.220061
## region 1.153646 12 1.005973
## highest_education 1.180044 4 1.020910
## studied_credits.std 1.238350 1 1.112812
## disability 1.030869 1 1.015317
## module_presentation_length 41.619049 1 6.451283
## code_module 151.757253 6 1.519716
## semester 47.462224 1 6.889283
## year 1.229218 1 1.108701
## total.clicks.std 1.487774 1 1.219743
## n.students 37.680235 1 6.138423
## num_of_prev_attempts_dich 1.092203 1 1.045085
## comb.score.std 1.190036 1 1.090888
A few independent variables show quite high VIF which is symptoms of multicollinearity. This makes sense because the number of students and length of a course are correlated with the module.
ggplot(combine.feat.unique, aes(x = code_module, y = n.students, fill = semester)) +
geom_bar(stat = "identity", position = "dodge") + theme_bw()
n.students and module_presentation_lengthm <- glm(withdraw ~ gender + region + highest_education + studied_credits.std +
disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich +
comb.score.std + imd_band_num, data = combine.feat.unique, family = binomial)
car::vif(m) # looks better
## GVIF Df GVIF^(1/(2*Df))
## gender 1.500246 1 1.224845
## region 1.293690 12 1.010787
## highest_education 1.182264 4 1.021149
## studied_credits.std 1.243820 1 1.115267
## disability 1.035426 1 1.017559
## code_module 3.113794 6 1.099278
## semester 1.027967 1 1.013887
## year 1.172279 1 1.082718
## total.clicks.std 1.477427 1 1.215495
## num_of_prev_attempts_dich 1.090450 1 1.044246
## comb.score.std 1.189781 1 1.090771
## imd_band_num 1.176506 1 1.084669
summary(m)
##
## Call:
## glm(formula = withdraw ~ gender + region + highest_education +
## studied_credits.std + disability + code_module + semester +
## year + total.clicks.std + num_of_prev_attempts_dich + comb.score.std +
## imd_band_num, family = binomial, data = combine.feat.unique)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8468 -0.5496 -0.3740 -0.2312 3.2960
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.880549 0.234868 -12.265
## genderM -0.331087 0.050414 -6.567
## regionEast Anglian Region -0.043346 0.096038 -0.451
## regionEast Midlands Region 0.035555 0.103088 0.345
## regionIreland -0.224889 0.139314 -1.614
## regionLondon Region -0.182070 0.099028 -1.839
## regionNorth Region 0.130146 0.131614 0.989
## regionNorth Western Region -0.077487 0.099717 -0.777
## regionScotland -0.082472 0.096673 -0.853
## regionSouth East Region 0.060541 0.106342 0.569
## regionSouth Region 0.040304 0.097552 0.413
## regionWales -0.317189 0.108236 -2.931
## regionWest Midlands Region 0.023064 0.102013 0.226
## regionYorkshire Region -0.044575 0.109551 -0.407
## highest_educationLower Than A Level -0.108241 0.210938 -0.513
## highest_educationA Level or Equivalent -0.455266 0.211385 -2.154
## highest_educationHE Qualification -0.323237 0.217010 -1.490
## highest_educationPost Graduate Qualification -0.943426 0.396998 -2.376
## studied_credits.std 0.150688 0.023956 6.290
## disabilityY 0.316934 0.065284 4.855
## code_moduleAAA 0.577660 0.152994 3.776
## code_moduleCCC 1.744103 0.078017 22.355
## code_moduleDDD 1.143262 0.069550 16.438
## code_moduleEEE 0.991637 0.119761 8.280
## code_moduleFFF 1.723407 0.077907 22.121
## code_moduleGGG -0.967554 0.125629 -7.702
## semesterJ 0.109779 0.042983 2.554
## year2014 0.309914 0.046808 6.621
## total.clicks.std -1.014747 0.024861 -40.816
## num_of_prev_attempts_dich0 0.366722 0.062056 5.910
## comb.score.std -0.196460 0.019897 -9.874
## imd_band_num -0.021280 0.007914 -2.689
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## genderM 5.12e-11 ***
## regionEast Anglian Region 0.65175
## regionEast Midlands Region 0.73017
## regionIreland 0.10647
## regionLondon Region 0.06598 .
## regionNorth Region 0.32274
## regionNorth Western Region 0.43712
## regionScotland 0.39360
## regionSouth East Region 0.56915
## regionSouth Region 0.67950
## regionWales 0.00338 **
## regionWest Midlands Region 0.82113
## regionYorkshire Region 0.68409
## highest_educationLower Than A Level 0.60785
## highest_educationA Level or Equivalent 0.03126 *
## highest_educationHE Qualification 0.13635
## highest_educationPost Graduate Qualification 0.01748 *
## studied_credits.std 3.17e-10 ***
## disabilityY 1.21e-06 ***
## code_moduleAAA 0.00016 ***
## code_moduleCCC < 2e-16 ***
## code_moduleDDD < 2e-16 ***
## code_moduleEEE < 2e-16 ***
## code_moduleFFF < 2e-16 ***
## code_moduleGGG 1.34e-14 ***
## semesterJ 0.01065 *
## year2014 3.57e-11 ***
## total.clicks.std < 2e-16 ***
## num_of_prev_attempts_dich0 3.43e-09 ***
## comb.score.std < 2e-16 ***
## imd_band_num 0.00717 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19524 on 21944 degrees of freedom
## Residual deviance: 15666 on 21913 degrees of freedom
## (6803 observations deleted due to missingness)
## AIC: 15730
##
## Number of Fisher Scoring iterations: 5
tidym.combined <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>%
filter(term != "(Intercept)") %>% mutate(OR = exp(estimate))
knitr::kable(tidym.combined)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high | OR |
|---|---|---|---|---|---|---|---|
| genderM | -0.3310866 | 0.0504144 | -6.5673057 | 0.0000000 | -0.4298784 | -0.2322407 | 0.7181430 |
| regionEast Anglian Region | -0.0433456 | 0.0960380 | -0.4513378 | 0.6517461 | -0.2313052 | 0.1452697 | 0.9575804 |
| regionEast Midlands Region | 0.0355552 | 0.1030878 | 0.3449019 | 0.7301681 | -0.1666407 | 0.2375880 | 1.0361948 |
| regionIreland | -0.2248888 | 0.1393143 | -1.6142541 | 0.1064723 | -0.5013415 | 0.0451573 | 0.7986051 |
| regionLondon Region | -0.1820697 | 0.0990277 | -1.8385743 | 0.0659778 | -0.3760514 | 0.0122457 | 0.8335432 |
| regionNorth Region | 0.1301461 | 0.1316142 | 0.9888454 | 0.3227388 | -0.1298580 | 0.3863345 | 1.1389948 |
| regionNorth Western Region | -0.0774865 | 0.0997171 | -0.7770638 | 0.4371211 | -0.2728155 | 0.1181822 | 0.9254395 |
| regionScotland | -0.0824725 | 0.0966732 | -0.8531059 | 0.3936006 | -0.2716398 | 0.1074171 | 0.9208368 |
| regionSouth East Region | 0.0605412 | 0.1063422 | 0.5693053 | 0.5691490 | -0.1482262 | 0.2687701 | 1.0624114 |
| regionSouth Region | 0.0403038 | 0.0975523 | 0.4131504 | 0.6794964 | -0.1506620 | 0.2318489 | 1.0411270 |
| regionWales | -0.3171893 | 0.1082359 | -2.9305372 | 0.0033838 | -0.5299166 | -0.1054878 | 0.7281929 |
| regionWest Midlands Region | 0.0230638 | 0.1020129 | 0.2260874 | 0.8211334 | -0.1769180 | 0.2230882 | 1.0233319 |
| regionYorkshire Region | -0.0445755 | 0.1095514 | -0.4068913 | 0.6840879 | -0.2598490 | 0.1697382 | 0.9564034 |
| highest_educationLower Than A Level | -0.1082411 | 0.2109380 | -0.5131417 | 0.6078522 | -0.5120407 | 0.3165136 | 0.8974112 |
| highest_educationA Level or Equivalent | -0.4552661 | 0.2113847 | -2.1537330 | 0.0312611 | -0.8600151 | -0.0297061 | 0.6342792 |
| highest_educationHE Qualification | -0.3232372 | 0.2170098 | -1.4895049 | 0.1363545 | -0.7395213 | 0.1127737 | 0.7238022 |
| highest_educationPost Graduate Qualification | -0.9434264 | 0.3969983 | -2.3763990 | 0.0174825 | -1.7588437 | -0.1919502 | 0.3892917 |
| studied_credits.std | 0.1506879 | 0.0239562 | 6.2901360 | 0.0000000 | 0.1036840 | 0.1975991 | 1.1626338 |
| disabilityY | 0.3169341 | 0.0652836 | 4.8547259 | 0.0000012 | 0.1882020 | 0.4441604 | 1.3729121 |
| code_moduleAAA | 0.5776604 | 0.1529942 | 3.7757024 | 0.0001596 | 0.2705053 | 0.8710313 | 1.7818648 |
| code_moduleCCC | 1.7441031 | 0.0780168 | 22.3554932 | 0.0000000 | 1.5915883 | 1.8974398 | 5.7207684 |
| code_moduleDDD | 1.1432620 | 0.0695498 | 16.4380339 | 0.0000000 | 1.0071560 | 1.2798200 | 3.1369845 |
| code_moduleEEE | 0.9916368 | 0.1197612 | 8.2801179 | 0.0000000 | 0.7540120 | 1.2237643 | 2.6956431 |
| code_moduleFFF | 1.7234065 | 0.0779074 | 22.1212181 | 0.0000000 | 1.5711438 | 1.8765674 | 5.6035848 |
| code_moduleGGG | -0.9675535 | 0.1256292 | -7.7016633 | 0.0000000 | -1.2182596 | -0.7252782 | 0.3800116 |
| semesterJ | 0.1097790 | 0.0429835 | 2.5539832 | 0.0106498 | 0.0256775 | 0.1941863 | 1.1160314 |
| year2014 | 0.3099141 | 0.0468082 | 6.6209398 | 0.0000000 | 0.2183026 | 0.4018090 | 1.3633080 |
| total.clicks.std | -1.0147472 | 0.0248615 | -40.8160512 | 0.0000000 | -1.0637216 | -0.9662574 | 0.3624940 |
| num_of_prev_attempts_dich0 | 0.3667222 | 0.0620562 | 5.9095147 | 0.0000000 | 0.2459014 | 0.4891943 | 1.4429970 |
| comb.score.std | -0.1964604 | 0.0198968 | -9.8739570 | 0.0000000 | -0.2354804 | -0.1574771 | 0.8216339 |
| imd_band_num | -0.0212797 | 0.0079137 | -2.6889900 | 0.0071669 | -0.0368022 | -0.0057784 | 0.9789451 |
ggplot(tidym.combined, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low,
ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") +
coord_flip() + ggtitle("Factors associated with withdrawal")
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique,
type = "link", se = TRUE))
pairs(aug1[, c("fit", "total.clicks.std", "studied_credits.std")])
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique,
type = "response", se = TRUE))
ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) +
theme_bw()
# Chisquare t-test
HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
##
## Hosmer-Lemeshow C statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 112.38, df = 8, p-value < 2.2e-16
##
##
## $H
##
## Hosmer-Lemeshow H statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 421.26, df = 8, p-value < 2.2e-16
# Roc curve
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique,
type = "response", se = TRUE))
roc.log <- ggplot(aug1, aes(d = withdraw, m = fit)) + geom_roc() + theme_bw()
roc.log + ggtitle("Withdrawal - combined semesters") + annotate("text", x = 0.75,
y = 0.25, label = paste("AUC =", round(calc_auc(roc.log)$AUC, 2))) + geom_abline(sslope = 1,
intercept = 0, linetype = "dotted") + scale_x_continuous("1 - Specificity",
breaks = seq(0, 1, by = 0.1))
m <- glm(pass_fail ~ gender + region + highest_education + studied_credits.std +
disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich +
comb.score.std + imd_band_num, data = combine.feat.unique, family = binomial)
car::vif(m) # looks better
## GVIF Df GVIF^(1/(2*Df))
## gender 1.651181 1 1.284983
## region 1.339386 12 1.012250
## highest_education 1.177187 4 1.020600
## studied_credits.std 1.581345 1 1.257515
## disability 1.028493 1 1.014146
## code_module 5.429016 6 1.151401
## semester 1.058644 1 1.028904
## year 1.154200 1 1.074337
## total.clicks.std 1.783593 1 1.335512
## num_of_prev_attempts_dich 1.075493 1 1.037060
## comb.score.std 1.132933 1 1.064393
## imd_band_num 1.170450 1 1.081873
summary(m)
##
## Call:
## glm(formula = pass_fail ~ gender + region + highest_education +
## studied_credits.std + disability + code_module + semester +
## year + total.clicks.std + num_of_prev_attempts_dich + comb.score.std +
## imd_band_num, family = binomial, data = combine.feat.unique)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4413 -0.2840 0.3116 0.5779 3.3900
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 1.179180 0.253092 4.659
## genderM 0.011299 0.055401 0.204
## regionEast Anglian Region -0.137171 0.103076 -1.331
## regionEast Midlands Region -0.095360 0.112182 -0.850
## regionIreland -0.182945 0.138227 -1.324
## regionLondon Region -0.177716 0.104911 -1.694
## regionNorth Region -0.064199 0.147565 -0.435
## regionNorth Western Region -0.331789 0.107265 -3.093
## regionScotland -0.493826 0.104328 -4.733
## regionSouth East Region 0.060673 0.118565 0.512
## regionSouth Region 0.098294 0.108015 0.910
## regionWales -0.405618 0.112010 -3.621
## regionWest Midlands Region -0.168068 0.110114 -1.526
## regionYorkshire Region -0.222582 0.116761 -1.906
## highest_educationLower Than A Level -0.201374 0.229537 -0.877
## highest_educationA Level or Equivalent 0.358984 0.229907 1.561
## highest_educationHE Qualification 0.341736 0.236268 1.446
## highest_educationPost Graduate Qualification 0.088385 0.378960 0.233
## studied_credits.std 0.059441 0.027609 2.153
## disabilityY -0.071637 0.074886 -0.957
## code_moduleAAA 0.209489 0.164701 1.272
## code_moduleCCC -1.298808 0.094268 -13.778
## code_moduleDDD -0.905796 0.075381 -12.016
## code_moduleEEE -1.579899 0.115367 -13.695
## code_moduleFFF -2.526675 0.088552 -28.533
## code_moduleGGG 0.072901 0.095531 0.763
## semesterJ 0.439185 0.045080 9.742
## year2014 0.405214 0.046507 8.713
## total.clicks.std 1.763172 0.035822 49.221
## num_of_prev_attempts_dich0 0.279031 0.064036 4.357
## comb.score.std 1.098220 0.030293 36.254
## imd_band_num 0.033028 0.008342 3.959
## Pr(>|z|)
## (Intercept) 3.18e-06 ***
## genderM 0.838394
## regionEast Anglian Region 0.183260
## regionEast Midlands Region 0.395296
## regionIreland 0.185666
## regionLondon Region 0.090269 .
## regionNorth Region 0.663520
## regionNorth Western Region 0.001980 **
## regionScotland 2.21e-06 ***
## regionSouth East Region 0.608843
## regionSouth Region 0.362819
## regionWales 0.000293 ***
## regionWest Midlands Region 0.126935
## regionYorkshire Region 0.056611 .
## highest_educationLower Than A Level 0.380321
## highest_educationA Level or Equivalent 0.118421
## highest_educationHE Qualification 0.148068
## highest_educationPost Graduate Qualification 0.815583
## studied_credits.std 0.031321 *
## disabilityY 0.338761
## code_moduleAAA 0.203397
## code_moduleCCC < 2e-16 ***
## code_moduleDDD < 2e-16 ***
## code_moduleEEE < 2e-16 ***
## code_moduleFFF < 2e-16 ***
## code_moduleGGG 0.445396
## semesterJ < 2e-16 ***
## year2014 < 2e-16 ***
## total.clicks.std < 2e-16 ***
## num_of_prev_attempts_dich0 1.32e-05 ***
## comb.score.std < 2e-16 ***
## imd_band_num 7.52e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 21908 on 18364 degrees of freedom
## Residual deviance: 13701 on 18333 degrees of freedom
## (10383 observations deleted due to missingness)
## AIC: 13765
##
## Number of Fisher Scoring iterations: 5
tidym.combined <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>%
filter(term != "(Intercept)")
ggplot(tidym.combined, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low,
ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") +
coord_flip() + ggtitle("Factors associated with withdrawal")
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique,
type = "link", se = TRUE))
pairs(aug1[, c("fit", "total.clicks.std", "studied_credits.std")])
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique,
type = "response", se = TRUE))
ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) +
theme_bw()
# Chisquare t-test
HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
##
## Hosmer-Lemeshow C statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 276170, df = 8, p-value < 2.2e-16
##
##
## $H
##
## Hosmer-Lemeshow H statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 181090, df = 8, p-value < 2.2e-16
# Roc curve
aug1 <- cbind(combine.feat.unique, predict(m, newdata = combine.feat.unique,
type = "response", se = TRUE))
roc.log <- ggplot(aug1, aes(d = withdraw, m = fit)) + geom_roc() + theme_bw()
roc.log + ggtitle("Withdrawal - combined semesters") + annotate("text", x = 0.75,
y = 0.25, label = paste("AUC =", round(calc_auc(roc.log)$AUC, 2))) + geom_abline(sslope = 1,
intercept = 0, linetype = "dotted") + scale_x_continuous("1 - Specificity",
breaks = seq(0, 1, by = 0.1))
semB <- subset(combine.feat.unique, semester %in% "B")
m <- glm(withdraw ~ gender + region + highest_education + studied_credits.std +
disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich +
comb.score.std + imd_band_num, data = semB, family = binomial)
car::vif(m) # oooks better
## GVIF Df GVIF^(1/(2*Df))
## gender 1.549976 1 1.244980
## region 1.300428 12 1.011006
## highest_education 1.175302 4 1.020396
## studied_credits.std 1.438013 1 1.199172
## disability 1.035898 1 1.017791
## code_module 3.960373 5 1.147555
## year 1.293596 1 1.137364
## total.clicks.std 1.618914 1 1.272365
## num_of_prev_attempts_dich 1.284724 1 1.133456
## comb.score.std 1.406154 1 1.185813
## imd_band_num 1.194279 1 1.092831
summary(m)
##
## Call:
## glm(formula = withdraw ~ gender + region + highest_education +
## studied_credits.std + disability + code_module + year + total.clicks.std +
## num_of_prev_attempts_dich + comb.score.std + imd_band_num,
## family = binomial, data = semB)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6309 -0.5559 -0.3925 -0.2479 2.9628
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.3198772 0.3298462 -7.033
## genderM -0.3448773 0.0831752 -4.146
## regionEast Anglian Region -0.0913176 0.1513791 -0.603
## regionEast Midlands Region 0.1375766 0.1608466 0.855
## regionIreland -0.0504522 0.2382082 -0.212
## regionLondon Region -0.2162457 0.1550372 -1.395
## regionNorth Region -0.2380043 0.2286610 -1.041
## regionNorth Western Region -0.0981280 0.1585165 -0.619
## regionScotland -0.1374326 0.1593822 -0.862
## regionSouth East Region -0.0008147 0.1714789 -0.005
## regionSouth Region -0.1665937 0.1576403 -1.057
## regionWales -0.4714802 0.1775146 -2.656
## regionWest Midlands Region -0.0681321 0.1617427 -0.421
## regionYorkshire Region -0.0396765 0.1730472 -0.229
## highest_educationLower Than A Level -0.4382973 0.2901558 -1.511
## highest_educationA Level or Equivalent -0.8023334 0.2911118 -2.756
## highest_educationHE Qualification -0.6809019 0.3022786 -2.253
## highest_educationPost Graduate Qualification -2.7591176 1.0648012 -2.591
## studied_credits.std 0.0178575 0.0423624 0.422
## disabilityY 0.3045867 0.1056286 2.884
## code_moduleCCC 1.5328302 0.1369064 11.196
## code_moduleDDD 1.2213514 0.1148741 10.632
## code_moduleEEE 0.9265971 0.2674343 3.465
## code_moduleFFF 1.5918358 0.1274617 12.489
## code_moduleGGG -1.0484223 0.2091110 -5.014
## year2014 0.0053164 0.0785677 0.068
## total.clicks.std -0.9312455 0.0422302 -22.052
## num_of_prev_attempts_dich0 0.3862757 0.1064487 3.629
## comb.score.std -0.2547609 0.0349667 -7.286
## imd_band_num 0.0017447 0.0129230 0.135
## Pr(>|z|)
## (Intercept) 2.02e-12 ***
## genderM 3.38e-05 ***
## regionEast Anglian Region 0.546350
## regionEast Midlands Region 0.392370
## regionIreland 0.832264
## regionLondon Region 0.163077
## regionNorth Region 0.297940
## regionNorth Western Region 0.535890
## regionScotland 0.388532
## regionSouth East Region 0.996209
## regionSouth Region 0.290605
## regionWales 0.007907 **
## regionWest Midlands Region 0.673582
## regionYorkshire Region 0.818650
## highest_educationLower Than A Level 0.130901
## highest_educationA Level or Equivalent 0.005849 **
## highest_educationHE Qualification 0.024287 *
## highest_educationPost Graduate Qualification 0.009564 **
## studied_credits.std 0.673359
## disabilityY 0.003932 **
## code_moduleCCC < 2e-16 ***
## code_moduleDDD < 2e-16 ***
## code_moduleEEE 0.000531 ***
## code_moduleFFF < 2e-16 ***
## code_moduleGGG 5.34e-07 ***
## year2014 0.946051
## total.clicks.std < 2e-16 ***
## num_of_prev_attempts_dich0 0.000285 ***
## comb.score.std 3.20e-13 ***
## imd_band_num 0.892604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7194.3 on 8162 degrees of freedom
## Residual deviance: 5923.1 on 8133 degrees of freedom
## (2651 observations deleted due to missingness)
## AIC: 5983.1
##
## Number of Fisher Scoring iterations: 6
tidyB <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>%
filter(term != "(Intercept)")
ggplot(tidyB, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low,
ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") +
coord_flip() + ggtitle("Factors associated with withdrawal")
aug1 <- cbind(semB, predict(m, newdata = semB, type = "response", se = TRUE))
ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) +
theme_bw()
HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
##
## Hosmer-Lemeshow C statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 52.208, df = 8, p-value = 1.534e-08
##
##
## $H
##
## Hosmer-Lemeshow H statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 165.94, df = 8, p-value < 2.2e-16
semJ <- subset(combine.feat.unique, semester %in% "J")
m <- glm(withdraw ~ gender + region + highest_education + studied_credits.std +
disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich +
comb.score.std + imd_band_num, data = semJ, family = binomial)
car::vif(m) # oooks better
## GVIF Df GVIF^(1/(2*Df))
## gender 1.473383 1 1.213830
## region 1.313810 12 1.011437
## highest_education 1.189169 4 1.021893
## studied_credits.std 1.232645 1 1.110245
## disability 1.038408 1 1.019023
## code_module 2.888944 6 1.092433
## year 1.163116 1 1.078479
## total.clicks.std 1.442007 1 1.200836
## num_of_prev_attempts_dich 1.067219 1 1.033063
## comb.score.std 1.147195 1 1.071072
## imd_band_num 1.165175 1 1.079433
summary(m)
##
## Call:
## glm(formula = withdraw ~ gender + region + highest_education +
## studied_credits.std + disability + code_module + year + total.clicks.std +
## num_of_prev_attempts_dich + comb.score.std + imd_band_num,
## family = binomial, data = semJ)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0559 -0.5406 -0.3575 -0.2178 3.3807
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -3.220820 0.340757 -9.452
## genderM -0.311627 0.063823 -4.883
## regionEast Anglian Region 0.008632 0.124906 0.069
## regionEast Midlands Region -0.000650 0.134808 -0.005
## regionIreland -0.258924 0.173611 -1.491
## regionLondon Region -0.172297 0.129660 -1.329
## regionNorth Region 0.337058 0.163682 2.059
## regionNorth Western Region -0.042584 0.129008 -0.330
## regionScotland -0.012856 0.122959 -0.105
## regionSouth East Region 0.109048 0.136554 0.799
## regionSouth Region 0.198870 0.125264 1.588
## regionWales -0.206181 0.137723 -1.497
## regionWest Midlands Region 0.106182 0.132399 0.802
## regionYorkshire Region -0.017462 0.142660 -0.122
## highest_educationLower Than A Level 0.220210 0.311740 0.706
## highest_educationA Level or Equivalent -0.117983 0.312117 -0.378
## highest_educationHE Qualification 0.014535 0.318247 0.046
## highest_educationPost Graduate Qualification -0.143236 0.486977 -0.294
## studied_credits.std 0.229344 0.030399 7.545
## disabilityY 0.326086 0.083523 3.904
## code_moduleAAA 0.608535 0.160047 3.802
## code_moduleCCC 1.821713 0.097862 18.615
## code_moduleDDD 1.025501 0.089300 11.484
## code_moduleEEE 1.045374 0.138493 7.548
## code_moduleFFF 1.757835 0.099957 17.586
## code_moduleGGG -0.841453 0.160022 -5.258
## year2014 0.508166 0.060214 8.439
## total.clicks.std -1.075798 0.031327 -34.341
## num_of_prev_attempts_dich0 0.313237 0.079850 3.923
## comb.score.std -0.171248 0.024985 -6.854
## imd_band_num -0.034820 0.010077 -3.455
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## genderM 1.05e-06 ***
## regionEast Anglian Region 0.944907
## regionEast Midlands Region 0.996153
## regionIreland 0.135857
## regionLondon Region 0.183900
## regionNorth Region 0.039473 *
## regionNorth Western Region 0.741332
## regionScotland 0.916729
## regionSouth East Region 0.424538
## regionSouth Region 0.112375
## regionWales 0.134374
## regionWest Midlands Region 0.422558
## regionYorkshire Region 0.902578
## highest_educationLower Than A Level 0.479946
## highest_educationA Level or Equivalent 0.705424
## highest_educationHE Qualification 0.963573
## highest_educationPost Graduate Qualification 0.768657
## studied_credits.std 4.54e-14 ***
## disabilityY 9.46e-05 ***
## code_moduleAAA 0.000143 ***
## code_moduleCCC < 2e-16 ***
## code_moduleDDD < 2e-16 ***
## code_moduleEEE 4.41e-14 ***
## code_moduleFFF < 2e-16 ***
## code_moduleGGG 1.45e-07 ***
## year2014 < 2e-16 ***
## total.clicks.std < 2e-16 ***
## num_of_prev_attempts_dich0 8.75e-05 ***
## comb.score.std 7.18e-12 ***
## imd_band_num 0.000550 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12328.7 on 13781 degrees of freedom
## Residual deviance: 9638.4 on 13751 degrees of freedom
## (4152 observations deleted due to missingness)
## AIC: 9700.4
##
## Number of Fisher Scoring iterations: 6
tidyJ <- tidy(m, conf.int = TRUE) %>% mutate(term = factor(term, levels = term)) %>%
filter(term != "(Intercept)")
ggplot(tidyJ, aes(x = term, y = estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low,
ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0, linetype = "dotted") +
coord_flip() + ggtitle("Factors associated with withdrawal - J")
aug1 <- cbind(semJ, predict(m, newdata = semJ, type = "response", se = TRUE))
ggplot(aug1, aes(x = fit, fill = factor(withdraw))) + geom_histogram(alpha = 0.5) +
theme_bw()
HLgof.test(fit = aug1$fit[!is.na(aug1$fit)], obs = aug1$withdraw[!is.na(aug1$fit)])
## $C
##
## Hosmer-Lemeshow C statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 51.488, df = 8, p-value = 2.113e-08
##
##
## $H
##
## Hosmer-Lemeshow H statistic
##
## data: aug1$fit[!is.na(aug1$fit)] and aug1$withdraw[!is.na(aug1$fit)]
## X-squared = 254.64, df = 8, p-value < 2.2e-16
tidyB <- tidyB %>% mutate(semester = "B")
tidyJ <- tidyJ %>% mutate(semester = "J")
# Differences of betas
tidy.compare.beta <- bind_rows(tidyB, tidyJ) %>% dplyr::select(term, estimate,
semester) %>% spread(key = semester, value = estimate) %>% mutate(diff.estimate = B -
J)
# Uncertainty of difference
tidy.compare.std <- bind_rows(tidyB, tidyJ) %>% dplyr::select(term, std.error,
semester) %>% spread(key = semester, value = std.error) %>% group_by(term) %>%
mutate(diff.std = sqrt(B^2 + J^2)) %>% dplyr::select(-B, -J)
# Combine and CI
tidy.compare <- tidy.compare.beta %>% left_join(tidy.compare.std) %>% mutate(diff.estimate = diff.estimate/diff.std) %>%
mutate(conf.low = diff.estimate - 1.96 * diff.std, conf.high = diff.estimate +
1.96 * diff.std)
ggplot(tidy.compare, aes(x = term, y = diff.estimate)) + geom_point() + geom_errorbar(aes(ymin = conf.low,
ymax = conf.high), width = 0.5) + theme_bw() + geom_hline(yintercept = 0,
linetype = "dotted") + coord_flip() + labs(y = "B - J") + ggtitle("Differences between semesters")
Now I will study what factors influence the total score obtained with assignments and exams.
It is slightly skewed.
hist(combine.feat.unique$comb.score)
m <- lm(comb.score ~ gender + region + highest_education + studied_credits.std +
disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich +
final_result, data = combine.feat.unique)
car::vif(m)
par(mfrow = c(2, 2))
plot(m)
library(MASS)
qqnorm(combine.feat.unique$comb.score,
ylab="Quantiles for comb.score")
qqline(combine.feat.unique$comb.score,
col="red")
Box = boxcox(combine.feat.unique$comb.score + 1 ~ 1, # Transform Turbidity as a single vector
lambda = seq(-6,6,0.1))
Cox = data.frame(Box$x, Box$y) # Create a data frame with the results
Cox2 = Cox[with(Cox, order(-Cox$Box.y)),] # Order the new data frame by decreasing y
Cox2[1,] # Display the lambda with the greatest
# log likelihood
lambda = Cox2[1, "Box.x"] # Extract that lambda
combine.feat.unique$comb.score.box = (combine.feat.unique$comb.score ^ lambda - 1)/lambda # Transform the original data
combine.feat.unique$comb.score.box.st <- scale(combine.feat.unique$comb.score.box,center = TRUE,scale = TRUE)
qqnorm(combine.feat.unique$comb.score.box,
ylab="Quantiles for comb.score")
qqline(combine.feat.unique$comb.score.box,
col="red")
m <- lm(comb.score.std ~ gender + region + highest_education + studied_credits.std +
disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich,
data = combine.feat.unique)
par(mfrow = c(2, 4))
plot(m)
# After box-cox
m <- lm(comb.score.box.st ~ gender + region + highest_education + studied_credits.std +
disability + code_module + semester + year + total.clicks.std + num_of_prev_attempts_dich,
data = combine.feat.unique)
plot(combine.feat.unique$comb.score.box.st, combine.feat.unique$comb.score)
plot(m)
car::vif(m)
tidym <- tidy(m, conf.int = TRUE)
tidym <- tidym %>% mutate(term = factor(term, levels = term)) %>% filter(term !=
"(Intercept)")
ggplot(tidym, aes(x = term, y = estimate, colour = -log10(p.value))) + geom_point() +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high)) + theme_bw() + geom_hline(yintercept = 0,
linetype = "dotted") + coord_flip() + ggtitle("Factors associated with total score")
m <- lm(comb.score.std ~ gender + region + highest_education + studied_credits.std +
disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich,
data = combine.feat.unique[combine.feat.unique$semester %in% "B", ])
par(mfrow = c(2, 4))
plot(m)
# After box-cox
m <- lm(comb.score.box.st ~ gender + region + highest_education + studied_credits.std +
disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich,
data = combine.feat.unique[combine.feat.unique$semester %in% "B", ])
par(mfrow = c(2, 2))
plot(m)
car::vif(m)
aug <- augment(m)
pairs(aug[, c(".resid", ".fitted", "total.clicks.std", ".std.resid", "studied_credits.std",
"comb.score.box.st")])
ggplot(aug, aes(x = code_module, y = .resid, fill = region)) + geom_boxplot()
# After box-cox
m <- lm(comb.score.box.st ~ gender + highest_education + total.clicks.std +
num_of_prev_attempts_dich, data = combine.feat.unique[combine.feat.unique$semester %in%
"B", ])
summary(m)
par(mfrow = c(2, 2))
plot(m)
aug <- augment(m)
pairs(aug[, c(".resid", ".fitted", "total.clicks.std", ".std.resid", "comb.score.box.st")])
m <- lm(comb.score.std ~ gender + region + highest_education + studied_credits.std +
disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich,
data = combine.feat.unique[combine.feat.unique$semester %in% "J", ])
summary(m)
par(mfrow = c(2, 2))
plot(m)
# After box-cox
m <- lm(comb.score.box.st ~ gender + highest_education + studied_credits.std +
disability + code_module + year + total.clicks.std + num_of_prev_attempts_dich,
data = combine.feat.unique[combine.feat.unique$semester %in% "J", ])
par(mfrow = c(2, 2))
plot(m)
summary(m)
car::vif(m)
aug <- augment(m)
pairs(aug[, c(".resid", ".fitted", "total.clicks.std", ".std.resid", "studied_credits.std",
"comb.score.box.st")])
knitr::purl("nous-student.Rmd", output = "nous-student.R", documentation = 2)